Cluster Analysis using K-means method

We use the K-means method to cluster municipalities into groups with similar characteristics.

The data we use is from Statistics Sweden (SCB). The analysis was carried out for my master's thesis, titled "The Impact of Swedish Public Finance Factors on the Local Real Estate Market — Based on the GMM PVAR Approach".

In [1]:
# OSMnx for tile/network utilities, GeoPandas for spatial data handling.
import osmnx as ox, geopandas as gpd
%matplotlib inline
# Enable console logging and the local response cache.
# NOTE(review): ox.config was removed in osmnx 2.0 (use ox.settings instead) —
# confirm against the installed osmnx version before rerunning.
ox.config(log_console=True, use_cache=True)
In [2]:
# Read the GADM level-2 (municipality) boundaries for Sweden.
# gpd.read_file locates the companion .shp/.shx files from the .dbf path.
link = './gadm36_SWE_shp/gadm36_SWE_2.dbf'
shape = gpd.read_file(link)
# Confirm the result type (notebook display) — expected GeoDataFrame.
type(shape)
Out[2]:
geopandas.geodataframe.GeoDataFrame
In [3]:
%matplotlib inline

import seaborn as sns
import pandas as pd
from pysal.lib import weights
import geopandas as gpd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import cluster
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm
from statsmodels.api import OLS
from sklearn import preprocessing
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from pandas.plotting import scatter_matrix

import seaborn as sns
In [4]:
# Load the per-municipality averaged indicators used for clustering.
# Values appear pre-normalized to roughly [0, 1] — TODO confirm upstream scaling.
ave = pd.read_csv("./22_cluster.csv",header = 0)
In [5]:
# Inspect the first rows (notebook display).
ave.head()
Out[5]:
Municipality ave_tax ave_pop_den ave_edu_rate ave_inmi_rate ave_price ave_employ_rate ave_dipo_inc
0 Upplands Väsby 0.137091 0.095355 0.371550 0.134272 0.288052 0.640304 0.147296
1 Vallentuna 0.114545 0.014934 0.428633 0.065944 0.271150 0.723146 0.213905
2 Österåker 0.061939 0.022535 0.413993 0.087037 0.275195 0.665232 0.235432
3 Värmdö 0.157030 0.015443 0.414915 0.068368 0.316443 0.715718 0.227583
4 Järfälla 0.114848 0.225892 0.461354 0.138136 0.305131 0.553390 0.153717
In [6]:
# Rename the key column to match the shapefile's municipality-name field.
ave.rename(columns = {'Municipality': 'NAME_2'},
           inplace = True)
In [7]:
# Left join keeps every row of `ave`; municipalities whose names do not match
# the shapefile get NaN geometry — TODO verify all names matched.
join = pd.merge(ave,shape,how = 'left',on = ['NAME_2'])
In [8]:
import geopandas as gpd
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import mapclassify
In [9]:
from geopandas import GeoDataFrame
In [10]:
# Wrap the attribute table as a GeoDataFrame (no geometry column attached here).
ave = GeoDataFrame(ave)
In [11]:
# The six indicators used as K-means features.
# NOTE(review): ave_price is present in the data but excluded here — presumably
# deliberate; confirm.
attribute = ["ave_tax","ave_edu_rate","ave_dipo_inc","ave_pop_den","ave_employ_rate","ave_inmi_rate"]
In [12]:
# pd.merge returned a plain DataFrame and lost the CRS; restore it from the
# source shapefile so plotting/basemaps align.
join.crs = shape.crs
In [13]:
from geopandas import GeoDataFrame

# Re-wrap the merged table as a GeoDataFrame so .plot() draws geometries.
join = GeoDataFrame(join)
In [14]:
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
%config InlineBackend.figure_format ='retina'

import seaborn as sns
import geopandas as gpd
import palettable as pltt
from seaborn import palplot

import osmnx as ox, geopandas as gpd
ox.config(log_console=True, use_cache=True)
import matplotlib as mpl
import matplotlib.cm as cm
import matplotlib.pyplot as plt

# For adding basemap
import contextily as cx

from sklearn import preprocessing 
from sklearn import cluster
In [15]:
def choro():
    """Plot a 2x3 grid of Fisher-Jenks choropleths, one per clustering attribute.

    Reads the module-level `join` (GeoDataFrame) and `attribute` (list of six
    column names). Saves the figure to 'choro.png' and displays it.
    """
    f, axs = plt.subplots(nrows=2, ncols=3, figsize=(30, 30))  # 2x3 grid fits the 6 attributes
    axs = axs.flatten()
    for i, columns in enumerate(attribute):
        ax = axs[i]
        # Blue choropleth with 6 Fisher-Jenks classes (requires mapclassify).
        join.plot(column=columns, scheme='fisher_jenks', k=6,
                  cmap=plt.cm.Blues, alpha=0.8, ax=ax,
                  edgecolor='w', linewidth=0.4)

        ax.set_axis_off()
        ax.set_title(columns)
        # Per-axes aspect instead of plt.axis('equal'), which acted on whatever
        # axes happened to be "current".
        ax.set_aspect('equal')

        cx.add_basemap(ax, crs=join.crs)  # web-tile basemap under the polygons

    # BUG FIX: suptitle used to be set inside the loop with each column name in
    # turn, leaving the figure titled after only the last attribute.
    f.suptitle('Choropleths of clustering attributes', fontsize='x-large')
    f.tight_layout()
    plt.savefig('choro.png', bbox_inches='tight')
    plt.show()
In [16]:
# Render and save the six attribute choropleths.
choro()

We test the clustering results for k = 3, 4, 5, 6, and 7.

In [17]:
def choro_cluster(columns, names):
    """Draw a categorical choropleth of one cluster-label column of `join`.

    columns: name of the label column to map.
    names:   title text, also used in the output filename.
    Saves the figure to 'choro_cluster_<names>.png' and displays it.
    """
    fig, axis = plt.subplots(1, figsize=(8, 8))
    style = dict(column=columns, categorical=True, cmap=plt.cm.Spectral,
                 legend=True, linewidth=0, alpha=0.75, ax=axis)
    join.plot(**style)
    axis.set_axis_off()
    plt.axis('equal')
    cx.add_basemap(axis, crs=join.crs)
    plt.title(names)
    plt.savefig(f'choro_cluster_{names}.png', bbox_inches='tight')
    plt.show()
In [18]:
# K-means with 3 clusters on the six normalized attributes.
kmeans3 = cluster.KMeans(n_clusters = 3)

# NOTE(review): seeding the global NumPy RNG only pins results while KMeans is
# left at random_state=None; passing random_state=12 to the constructor would
# be more robust — confirm before changing (labels below would shift).
np.random.seed(12)
k3cls = kmeans3.fit(join[attribute])
In [19]:
# Cluster assignment (0-2) per municipality, in row order (notebook display).
k3cls.labels_
Out[19]:
array([2, 2, 2, 2, 2, 2, 2, 1, 2, 1, 2, 1, 1, 2, 2, 2, 2, 1, 2, 2, 2, 2,
       2, 1, 1, 1, 1, 0, 2, 1, 0, 2, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0,
       1, 1, 1, 0, 0, 0, 2, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 0,
       1, 1, 0, 0, 0, 0, 1, 1, 0, 2, 1, 0, 0, 1, 0, 0, 0, 2, 0, 1, 0, 1,
       0, 1, 0, 1, 0, 0, 0, 1, 2, 0, 2, 0, 0, 0, 2, 2, 1, 1, 1, 1, 1, 0,
       0, 0, 0, 0, 0, 1, 2, 2, 0, 1, 2, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1,
       1, 2, 2, 2, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 2, 1, 1, 1, 1, 1, 0, 1,
       0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 2, 2, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0,
       0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 2, 0, 0, 0, 0, 1, 2, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 1, 0, 0,
       0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1,
       1, 0, 1, 1, 1, 0, 1, 0, 0, 2, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 2,
       1, 1, 0, 1], dtype=int32)
In [20]:
# Store the k=3 labels as a column for mapping and faceting.
join['k3cls'] = k3cls.labels_
In [21]:
# Map the three clusters.
choro_cluster('k3cls','K=3')
In [22]:
# Build a long-format (k3cls, Attribute, Values) table for the KDE facets:
# index rows by their k=3 label, keep only the clustering variables, then
# melt wide -> long.
to_plot = (
    join.set_index('k3cls')[attribute]
        .stack()
        .reset_index()
        .rename(columns={'level_1': 'Attribute', 0: 'Values'})
)

# Display top of the table
to_plot.head()
Out[22]:
k3cls Attribute Values
0 2 ave_tax 0.137091
1 2 ave_edu_rate 0.371550
2 2 ave_dipo_inc 0.147296
3 2 ave_pop_den 0.095355
4 2 ave_employ_rate 0.640304
In [23]:
# One KDE panel per attribute, coloured by k=3 cluster membership.
facets = sns.FacetGrid(data=to_plot, row='Attribute', hue='k3cls', sharey=False, sharex=False, aspect=2)
# FIX: `shade=` was deprecated in seaborn 0.11 and removed in 0.14;
# `fill=True` is the supported equivalent.
_ = facets.map(sns.kdeplot, 'Values', fill=True).add_legend()
In [24]:
# K-means with 4 clusters on the same six attributes.
kmeans4 = cluster.KMeans(n_clusters = 4)

# NOTE(review): global-RNG seeding is fragile; random_state=123 on the
# constructor would pin results explicitly — confirm before changing.
np.random.seed(123)
k4cls = kmeans4.fit(join[attribute])
In [25]:
# Cluster assignment (0-3) per municipality (notebook display).
k4cls.labels_
Out[25]:
array([3, 3, 3, 3, 3, 3, 3, 0, 3, 1, 3, 1, 1, 3, 3, 3, 2, 0, 3, 2, 2, 3,
       3, 1, 1, 1, 1, 0, 3, 1, 0, 3, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0,
       1, 1, 1, 0, 0, 0, 3, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 3, 1, 1, 0,
       1, 1, 0, 0, 0, 0, 1, 1, 0, 3, 1, 0, 0, 1, 0, 0, 0, 3, 0, 1, 0, 1,
       0, 1, 0, 3, 0, 0, 0, 1, 3, 0, 3, 0, 0, 0, 3, 3, 1, 1, 1, 1, 1, 0,
       0, 0, 0, 0, 0, 3, 3, 3, 0, 3, 3, 1, 1, 1, 1, 0, 1, 0, 0, 3, 1, 1,
       1, 3, 3, 3, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 3, 1, 1, 1, 1, 1, 0, 1,
       0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 3, 3, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0,
       0, 1, 1, 3, 1, 1, 1, 1, 0, 0, 0, 3, 0, 0, 0, 0, 1, 3, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 3, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 0, 0,
       0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 3, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 3, 0, 0, 1,
       1, 0, 1, 1, 1, 0, 1, 0, 0, 3, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 3,
       1, 1, 0, 1], dtype=int32)
In [26]:
# Attach the k=4 labels for mapping/faceting.
join['k4cls'] = k4cls.labels_
In [27]:
# Long-format (k4cls, Attribute, Values) table for the k=4 facets.
stacked = join.set_index('k4cls')[attribute].stack()
to_plot = stacked.reset_index()
to_plot = to_plot.rename(columns={'level_1': 'Attribute', 0: 'Values'})

# Display top of the table
to_plot.head()
Out[27]:
k4cls Attribute Values
0 3 ave_tax 0.137091
1 3 ave_edu_rate 0.371550
2 3 ave_dipo_inc 0.147296
3 3 ave_pop_den 0.095355
4 3 ave_employ_rate 0.640304
In [28]:
# One KDE panel per attribute, coloured by k=4 cluster membership.
facets = sns.FacetGrid(data=to_plot, row='Attribute', hue='k4cls', sharey=False, sharex=False, aspect=2)
# FIX: `shade=` was deprecated in seaborn 0.11 and removed in 0.14;
# `fill=True` is the supported equivalent.
_ = facets.map(sns.kdeplot, 'Values', fill=True).add_legend()

The groups overlap considerably, so this is not an ideal result.

In [29]:
# K-means with 5 clusters.
kmeans5 = cluster.KMeans(n_clusters = 5)

# NOTE(review): global-RNG seeding is fragile; random_state=1234 on the
# constructor would pin results explicitly — confirm before changing.
np.random.seed(1234)
k5cls = kmeans5.fit(join[attribute])
In [30]:
# Attach the k=5 labels for mapping/faceting.
join['k5cls'] = k5cls.labels_
In [31]:
# Reshape to long form keyed by the k=5 cluster label.
subset = join.set_index('k5cls')[attribute]
to_plot = subset.stack().reset_index()
# After reset_index the columns are ['k5cls', 'level_1', 0] in that order.
to_plot.columns = ['k5cls', 'Attribute', 'Values']

# Display top of the table
to_plot.head()
Out[31]:
k5cls Attribute Values
0 1 ave_tax 0.137091
1 1 ave_edu_rate 0.371550
2 1 ave_dipo_inc 0.147296
3 1 ave_pop_den 0.095355
4 1 ave_employ_rate 0.640304
In [32]:
# NOTE(review): this cell duplicates the previous cell verbatim — it rebuilds
# the same k5cls long table and can be removed without changing any result.
# Name (index) the rows after the category they belong
to_plot = join.set_index('k5cls')
# Subset to keep only variables used in K-means clustering
to_plot = to_plot[attribute]

to_plot = to_plot.stack()
to_plot = to_plot.reset_index()
to_plot = to_plot.rename(columns={'level_1': 'Attribute', 0: 'Values'})

# Display top of the table
to_plot.head()
Out[32]:
k5cls Attribute Values
0 1 ave_tax 0.137091
1 1 ave_edu_rate 0.371550
2 1 ave_dipo_inc 0.147296
3 1 ave_pop_den 0.095355
4 1 ave_employ_rate 0.640304
In [33]:
# One KDE panel per attribute, coloured by k=5 cluster membership.
facets = sns.FacetGrid(data=to_plot, row='Attribute', hue='k5cls', sharey=False, sharex=False, aspect=2)
# FIX: `shade=` was deprecated in seaborn 0.11 and removed in 0.14;
# `fill=True` is the supported equivalent.
_ = facets.map(sns.kdeplot, 'Values', fill=True).add_legend()
In [34]:
# K-means with 7 clusters.
kmeans7 = cluster.KMeans(n_clusters = 7)

# NOTE(review): global-RNG seeding is fragile; random_state=123456 on the
# constructor would pin results explicitly — confirm before changing.
np.random.seed(123456)
k7cls = kmeans7.fit(join[attribute])
In [35]:
# Attach the k=7 labels for faceting.
join['k7cls'] = k7cls.labels_
In [36]:
# Long-format (k7cls, Attribute, Values) table for the KDE facets.
to_plot = (
    join.set_index('k7cls')  # index rows by their k=7 cluster
        [attribute]          # keep only the clustering variables
        .stack()             # wide -> long
        .reset_index()
        .rename(columns={'level_1': 'Attribute', 0: 'Values'})
)

# Peek at the reshaped table
to_plot.head()
Out[36]:
k7cls Attribute Values
0 3 ave_tax 0.137091
1 3 ave_edu_rate 0.371550
2 3 ave_dipo_inc 0.147296
3 3 ave_pop_den 0.095355
4 3 ave_employ_rate 0.640304
In [37]:
# One KDE panel per attribute, coloured by k=7 cluster membership.
facets = sns.FacetGrid(data=to_plot, row='Attribute', hue='k7cls', sharey=False, sharex=False, aspect=2)
# FIX: `shade=` was deprecated in seaborn 0.11 and removed in 0.14;
# `fill=True` is the supported equivalent.
_ = facets.map(sns.kdeplot, 'Values', fill=True).add_legend()
In [38]:
# K-means with 6 clusters.
kmeans6 = cluster.KMeans(n_clusters = 6)

# NOTE(review): global-RNG seeding is fragile; random_state=12345 on the
# constructor would pin results explicitly — confirm before changing.
np.random.seed(12345)
k6cls = kmeans6.fit(join[attribute])
In [39]:
# Attach the k=6 labels for faceting.
join['k6cls'] = k6cls.labels_
In [40]:
# Reshape: index by the k=6 label, keep the clustering variables, melt to long.
long_form = join.set_index('k6cls')[attribute].stack()
to_plot = long_form.reset_index().rename(
    columns={'level_1': 'Attribute', 0: 'Values'}
)

# Show the first rows
to_plot.head()
Out[40]:
k6cls Attribute Values
0 2 ave_tax 0.137091
1 2 ave_edu_rate 0.371550
2 2 ave_dipo_inc 0.147296
3 2 ave_pop_den 0.095355
4 2 ave_employ_rate 0.640304
In [41]:
# One KDE panel per attribute, coloured by k=6 cluster membership.
facets = sns.FacetGrid(data=to_plot, row='Attribute', hue='k6cls', sharey=False, sharex=False, aspect=2)
# FIX: `shade=` was deprecated in seaborn 0.11 and removed in 0.14;
# `fill=True` is the supported equivalent.
_ = facets.map(sns.kdeplot, 'Values', fill=True).add_legend()
In [42]:
# Elbow plot: within-cluster sum of squared errors (inertia) for k = 1..8.
SSE = []
for k in range(1, 9):
    estimator = cluster.KMeans(n_clusters=k)
    estimator.fit(join[attribute])
    SSE.append(estimator.inertia_)  # sum of squared distances to nearest centroid
X = range(1, 9)
plt.xlabel('k')
plt.ylabel('SSE')
plt.plot(X, SSE, 'o-')
# BUG FIX: the original ended with a bare `plt.savefig` (the function was
# referenced, never called), and it came after plt.show(), which closes the
# figure — so the elbow plot was never written to disk. Save first, then show.
plt.savefig('elbow.png', bbox_inches='tight')
plt.show()
Out[42]:
<function matplotlib.pyplot.savefig(*args, **kwargs)>

Based on the elbow method, we choose k = 5, because beyond 5 the curve decreases approximately linearly.

We finally present the cluster maps here. There are no clear spatial boundaries between the groups.

In [43]:
# Final cluster maps: the chosen k=5 solution, plus k=3 and k=4 for comparison.
choro_cluster('k5cls','K=5')
In [44]:
choro_cluster('k3cls','K=3')
In [45]:
choro_cluster('k4cls','K=4')